// HTMLParser Library $Name: v1_6 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/beans/FilterBean.java,v $
// $Author: derrickoswald $
// $Date: 2005/09/18 23:40:44 $
// $Revision: 1.4 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

package org.htmlparser.beans;

import java.beans.PropertyChangeListener;
import java.beans.PropertyChangeSupport;
import java.io.Serializable;
import java.net.URLConnection;

import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.EncodingChangeException;

/**
 * Extract nodes from a URL using a chain of filters.
 *
 * <pre>
 * <code>
 * FilterBean fb = new FilterBean();
 * fb.setFilters(new NodeFilter[] { new TagNameFilter(&quot;META&quot;) });
 * fb.setURL(&quot;http://cbc.ca&quot;);
 * System.out.println(fb.getNodes().toHtml());
 * </code>
 * </pre>
 *
 * The bean exposes the bound properties named by the <code>PROP_*</code>
 * constants; listeners are notified through a
 * {@link PropertyChangeSupport} whenever one of them changes.
 */
public class FilterBean implements Serializable {
	/**
	 * Property name in the event fired when the extracted node list changes.
	 */
	public static final String PROP_NODES_PROPERTY = "nodes";

	/**
	 * Property name in the event fired when the text of the extracted nodes
	 * changes.
	 */
	public static final String PROP_TEXT_PROPERTY = "text";

	/**
	 * Property name in event where the URL changes.
	 */
	public static final String PROP_URL_PROPERTY = "URL";

	/**
	 * Property name in event where the connection changes.
	 */
	public static final String PROP_CONNECTION_PROPERTY = "connection";

	/**
	 * Bound property support.
	 */
	protected PropertyChangeSupport mPropertySupport;

	/**
	 * The parser used to filter.
	 */
	protected Parser mParser;

	/**
	 * The filter set. May be <code>null</code>, meaning no filtering.
	 */
	protected NodeFilter[] mFilters;

	/**
	 * The nodes extracted from the URL, or <code>null</code> if nothing has
	 * been fetched yet.
	 */
	protected NodeList mNodes;

	/**
	 * The recursion behaviour for elements of the filter array. If
	 * <code>true</code> the filters are applied recursively.
	 *
	 * @see org.htmlparser.util.NodeList#extractAllNodesThatMatch(NodeFilter,
	 *      boolean)
	 */
	protected boolean mRecursive;

	/**
	 * Create a FilterBean object with a fresh parser, no filters, no nodes
	 * and recursive filtering enabled.
	 */
	public FilterBean() {
		mPropertySupport = new PropertyChangeSupport(this);
		mParser = new Parser();
		mFilters = null;
		mNodes = null;
		mRecursive = true;
	}

	//
	// internals
	//

	/**
	 * Assign the <code>Nodes</code> property, firing the property change.
	 * Nothing happens when the current node list is non-null and equal to
	 * <code>nodes</code>. Otherwise a {@link #PROP_NODES_PROPERTY} event is
	 * fired, followed by a {@link #PROP_TEXT_PROPERTY} event if the text of
	 * the old and new node lists differs.
	 *
	 * @param nodes
	 *            The new value of the <code>Nodes</code> property.
	 */
	protected void updateNodes(NodeList nodes) {
		if ((null != mNodes) && mNodes.equals(nodes)) {
			return;
		}

		NodeList oldValue = mNodes;
		// Only ask for the text when a node list is present: getText() goes
		// through getNodes(), which would re-enter setNodes() on a null list.
		String oldText = (null != oldValue) ? getText() : "";
		if (null == oldText) {
			oldText = "";
		}

		mNodes = nodes;
		String newText = (null != mNodes) ? getText() : "";
		if (null == newText) {
			newText = "";
		}

		mPropertySupport.firePropertyChange(PROP_NODES_PROPERTY, oldValue,
				nodes);
		if (!newText.equals(oldText)) {
			mPropertySupport.firePropertyChange(PROP_TEXT_PROPERTY, oldText,
					newText);
		}
	}

	/**
	 * Apply each of the filters. The first filter is applied to the output of
	 * the parser. Subsequent filters are applied to the output of the prior
	 * filter.
	 *
	 * @return A list of nodes passed through all filters. If there are no
	 *         filters, returns the entire page.
	 * @throws ParserException
	 *             If an encoding change occurs or there is some other problem.
	 */
	protected NodeList applyFilters() throws ParserException {
		NodeList ret = mParser.parse(null);
		NodeFilter[] filters = getFilters();
		if (null != filters) {
			for (int i = 0; i < filters.length; i++) {
				ret = ret.extractAllNodesThatMatch(filters[i], mRecursive);
			}
		}

		return (ret);
	}

	/**
	 * Fetch the URL contents and filter it. Only do work if there is a valid
	 * parser with its URL set. If the parse reports an encoding change, the
	 * parser is reset and the parse is attempted once more; any other parser
	 * failure results in an empty node list.
	 */
	protected void setNodes() {
		if (null == getURL()) {
			return;
		}

		try {
			updateNodes(applyFilters());
		} catch (EncodingChangeException ece) {
			// try again with the encoding now in force
			try {
				mParser.reset();
				updateNodes(applyFilters());
			} catch (ParserException pe) {
				updateNodes(new NodeList());
			}
		} catch (ParserException pe) {
			updateNodes(new NodeList());
		}
	}

	//
	// Property change support.
	//

	/**
	 * Add a PropertyChangeListener to the listener list. The listener is
	 * registered for all properties.
	 *
	 * @param listener
	 *            The PropertyChangeListener to be added.
	 */
	public void addPropertyChangeListener(PropertyChangeListener listener) {
		mPropertySupport.addPropertyChangeListener(listener);
	}

	/**
	 * Remove a PropertyChangeListener from the listener list. This removes a
	 * registered PropertyChangeListener.
	 *
	 * @param listener
	 *            The PropertyChangeListener to be removed.
	 */
	public void removePropertyChangeListener(PropertyChangeListener listener) {
		mPropertySupport.removePropertyChangeListener(listener);
	}

	//
	// Properties
	//

	/**
	 * Return the nodes of the URL matching the filter. This is the primary
	 * output of the bean. If no nodes have been fetched yet, a fetch is
	 * attempted first.
	 *
	 * @return The nodes from the URL matching the current filter, or
	 *         <code>null</code> if there is still nothing fetched (for
	 *         example because no URL has been set).
	 */
	public NodeList getNodes() {
		if (null == mNodes) {
			setNodes();
		}

		return (mNodes);
	}

	/**
	 * Get the current URL.
	 *
	 * @return The URL from which text has been extracted, or <code>null</code>
	 *         if this property has not been set yet.
	 */
	public String getURL() {
		return ((null != mParser) ? mParser.getURL() : null);
	}

	/**
	 * Set the URL to extract strings from. The text from the URL will be
	 * fetched, which may be expensive, so this property should be set last.
	 * Nothing happens if the URL is unchanged. If the parser rejects the URL
	 * the node list becomes empty.
	 *
	 * @param url
	 *            The URL that text should be fetched from.
	 */
	public void setURL(String url) {
		String old = getURL();
		URLConnection conn = getConnection();
		boolean changed = (null == old) ? (null != url) : !old.equals(url);

		if (changed) {
			try {
				if (null == mParser) {
					mParser = new Parser(url);
				} else {
					mParser.setURL(url);
				}
				mPropertySupport.firePropertyChange(PROP_URL_PROPERTY, old,
						getURL());
				mPropertySupport.firePropertyChange(PROP_CONNECTION_PROPERTY,
						conn, mParser.getConnection());
				setNodes();
			} catch (ParserException pe) {
				updateNodes(new NodeList());
			}
		}
	}

	/**
	 * Get the current connection.
	 *
	 * @return The connection that the parser has or <code>null</code> if it
	 *         hasn't been set or the parser hasn't been constructed yet.
	 */
	public URLConnection getConnection() {
		return ((null != mParser) ? mParser.getConnection() : null);
	}

	/**
	 * Set the parser's connection. The text from the URL will be fetched, which
	 * may be expensive, so this property should be set last. Nothing happens
	 * if the connection is unchanged. If the parser rejects the connection
	 * the node list becomes empty.
	 *
	 * @param connection
	 *            New value of property Connection.
	 */
	public void setConnection(URLConnection connection) {
		String url = getURL();
		URLConnection conn = getConnection();
		boolean changed = (null == conn) ? (null != connection)
				: !conn.equals(connection);

		if (changed) {
			try {
				if (null == mParser) {
					mParser = new Parser(connection);
				} else {
					mParser.setConnection(connection);
				}
				mPropertySupport.firePropertyChange(PROP_URL_PROPERTY, url,
						getURL());
				mPropertySupport.firePropertyChange(PROP_CONNECTION_PROPERTY,
						conn, mParser.getConnection());
				setNodes();
			} catch (ParserException pe) {
				updateNodes(new NodeList());
			}
		}
	}

	/**
	 * Get the current filter set.
	 *
	 * @return The current filters, or <code>null</code> if none are set.
	 */
	public NodeFilter[] getFilters() {
		return (mFilters);
	}

	/**
	 * Set the filters for the bean. If the parser has been set, it is reset and
	 * the nodes are refetched with the new filters.
	 *
	 * @param filters
	 *            The filter set to use.
	 */
	public void setFilters(NodeFilter[] filters) {
		mFilters = filters;
		if (null != getParser()) {
			getParser().reset();
			setNodes();
		}
	}

	/**
	 * Get the parser used to fetch nodes.
	 *
	 * @return The parser used by the bean.
	 */
	public Parser getParser() {
		return (mParser);
	}

	/**
	 * Set the parser for the bean. If a filter set is present the parser is
	 * used immediately to fetch the nodes. With a <code>null</code> filter
	 * set no fetch is triggered here; {@link #getNodes()} fetches on demand
	 * only while no node list is held.
	 *
	 * @param parser
	 *            The parser to use.
	 */
	public void setParser(Parser parser) {
		mParser = parser;
		if (null != getFilters()) {
			setNodes();
		}
	}

	/**
	 * Convenience method to apply a {@link StringBean} to the filter results.
	 * This may yield duplicate or multiple text elements if the node list
	 * contains nodes from two or more levels in the same nested tag hierarchy,
	 * but if the node list contains only one tag, it provides access to the
	 * text within the node.
	 *
	 * @return The textual contents of the nodes that pass through the filter
	 *         set, as collected by the StringBean. An empty string is
	 *         returned when there are no nodes.
	 */
	public String getText() {
		NodeList list = getNodes();
		// getNodes() yields null while nothing could be fetched (e.g. no URL
		// set yet); treat that the same as an empty list instead of failing.
		if ((null == list) || (0 == list.size())) {
			return ("");
		}

		StringBean sb = new StringBean();
		for (int i = 0; i < list.size(); i++) {
			list.elementAt(i).accept(sb);
		}

		return (sb.getStrings());
	}

	/**
	 * Get the current recursion behaviour.
	 *
	 * @return The recursion (applies to children, children's children, etc)
	 *         behavior currently being used.
	 */
	public boolean getRecursive() {
		return (mRecursive);
	}

	/**
	 * Set the recursion behaviour. The new value is used the next time the
	 * filters are applied; this setter does not refetch the nodes itself.
	 *
	 * @param recursive
	 *            If <code>true</code> the
	 *            <code>extractAllNodesThatMatch()</code> call is performed
	 *            recursively.
	 * @see org.htmlparser.util.NodeList#extractAllNodesThatMatch(NodeFilter,
	 *      boolean)
	 */
	public void setRecursive(boolean recursive) {
		mRecursive = recursive;
	}

	/**
	 * Command line driver: prints the text of the nodes fetched from a URL,
	 * or a usage message when no arguments are given.
	 *
	 * @param args
	 *            Pass arg[0] as the URL to process, and optionally a node name
	 *            for filtering.
	 */
	public static void main(String[] args) {
		if (0 >= args.length) {
			System.out
					.println("Usage: java -classpath htmlparser.jar org.htmlparser.beans.FilterBean <http://whatever_url> [node name]");
		} else {
			FilterBean fb = new FilterBean();
			if (1 < args.length) {
				// filters go in first so the URL is fetched only once
				fb.setFilters(new NodeFilter[] {
						new org.htmlparser.filters.TagNameFilter(args[1]) });
			}
			fb.setURL(args[0]);
			System.out.println(fb.getText());
		}
	}
}
